{
/* NB. There are 512 8-byte entries per GDT page. */
unsigned int i, j, nr_pages = (entries + 511) / 512;
- unsigned long pfn, *gdt_page, flags;
+ unsigned long pfn, *gdt_page;
long ret = -EINVAL;
struct pfn_info *page;
struct desc_struct *vgdt;
- spin_lock_irqsave(&p->page_lock, flags);
+ spin_lock(&p->page_lock);
/* Check the new GDT. */
for ( i = 0; i < nr_pages; i++ )
ret = 0; /* success */
out:
- spin_unlock_irqrestore(&p->page_lock, flags);
+ spin_unlock(&p->page_lock);
return ret;
}
long do_update_descriptor(
unsigned long pa, unsigned long word1, unsigned long word2)
{
- unsigned long *gdt_pent, flags, pfn = pa >> PAGE_SHIFT;
+ unsigned long *gdt_pent, pfn = pa >> PAGE_SHIFT;
struct pfn_info *page;
long ret = -EINVAL;
if ( (pa & 7) || (pfn >= max_page) || !check_descriptor(word1, word2) )
return -EINVAL;
- spin_lock_irqsave(¤t->page_lock, flags);
+ spin_lock(¤t->page_lock);
page = frame_table + pfn;
if ( (page->flags & PG_domain_mask) != current->domain )
ret = 0; /* success */
out:
- spin_unlock_irqrestore(¤t->page_lock, flags);
+ spin_unlock(¤t->page_lock);
return ret;
}
#include <xeno/sched.h>
#include <xeno/errno.h>
#include <xeno/perfc.h>
+#include <xeno/interrupt.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/io.h>
{
struct task_struct *p = current;
unsigned long addr = p->mm.ldt_base + (off << PAGE_SHIFT);
- unsigned long l1e, *ldt_page, flags;
+ unsigned long l1e, *ldt_page;
struct pfn_info *page;
int i, ret = -1;
- spin_lock_irqsave(&p->page_lock, flags);
+ /* We cannot take a page_lock in interrupt context. */
+ if ( in_interrupt() )
+ BUG();
+
+ spin_lock(&p->page_lock);
__get_user(l1e, (unsigned long *)(linear_pg_table+(addr>>PAGE_SHIFT)));
if ( unlikely(!(l1e & _PAGE_PRESENT)) )
ret = 0;
out:
- spin_unlock_irqrestore(&p->page_lock, flags);
+ spin_unlock(&p->page_lock);
return ret;
}
err = 1;
- spin_lock_irq(&current->page_lock);
+ spin_lock(&current->page_lock);
/* Get the page-frame number that a non-extended command references. */
if ( (cmd == MMU_NORMAL_PT_UPDATE) ||
}
unlock:
- spin_unlock_irq(&current->page_lock);
+ spin_unlock(&current->page_lock);
if ( unlikely(err) )
{
if ( unlikely(page_nr >= (HYPERVISOR_VIRT_START >> PAGE_SHIFT)) )
goto out;
- spin_lock_irq(&p->page_lock);
+ spin_lock(&p->page_lock);
/* Check that the VA's page-directory entry is present.. */
if ( unlikely((err = __get_user(_x, (unsigned long *)
if ( unlikely(cr0 != 0) )
write_cr0(cr0);
unlock_and_out:
- spin_unlock_irq(&p->page_lock);
+ spin_unlock(&p->page_lock);
out:
return err;
}
static kmem_cache_t *buffer_head_cachep;
static atomic_t nr_pending;
+static struct buffer_head *completed_bhs[NR_CPUS] __cacheline_aligned;
+
static int __buffer_is_valid(struct task_struct *p,
unsigned long buffer,
unsigned short size,
/******************************************************************
* COMPLETION CALLBACK -- Called as bh->b_end_io()
- * NB. This can be called from interrupt context!
*/
+/*
+ * Softirq handler: drain this CPU's list of completed buffer_heads and
+ * deliver the corresponding responses to the guest domains.
+ *
+ * end_block_io_op() (which may run in hardirq context) only chains
+ * finished bhs onto completed_bhs[cpu]; all response work is deferred
+ * to here, so the locks taken below need not be irq-safe.
+ */
+static void end_block_io_op_softirq(struct softirq_action *h)
+{
+ pending_req_t *pending_req;
+ struct buffer_head *bh, *nbh;
+ unsigned int cpu = smp_processor_id();
+
+ /* Detach the per-CPU list atomically w.r.t. the hardirq enqueuer. */
+ local_irq_disable();
+ bh = completed_bhs[cpu];
+ completed_bhs[cpu] = NULL;
+ local_irq_enable();
+
+ while ( bh != NULL )
+ {
+ pending_req = bh->pending_req;
+
+ unlock_buffer(pending_req->domain,
+ virt_to_phys(bh->b_data),
+ bh->b_size,
+ (pending_req->operation==READ));
+
+ /* Last bh of the request? Then post the overall response. */
+ if ( atomic_dec_and_test(&pending_req->pendcnt) )
+ {
+ make_response(pending_req->domain, pending_req->id,
+ pending_req->operation, pending_req->status);
+ put_task_struct(pending_req->domain);
+ /* Recycle the pending_req slot onto the free ring. */
+ spin_lock(&pend_prod_lock);
+ pending_ring[pending_prod] = pending_req - pending_reqs;
+ PENDREQ_IDX_INC(pending_prod);
+ spin_unlock(&pend_prod_lock);
+ atomic_dec(&nr_pending);
+ maybe_trigger_io_schedule();
+ }
+
+ nbh = bh->b_reqnext;
+ kmem_cache_free(buffer_head_cachep, bh);
+ bh = nbh;
+ }
+}
+
+/*
+ * b_end_io callback -- may be invoked in hardirq context. Queue the
+ * finished bh on this CPU's completion list; the real response work
+ * (and the freeing of the bh) happens in end_block_io_op_softirq().
+ */
static void end_block_io_op(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
- pending_req_t *pending_req = bh->pending_req;
+ unsigned int cpu = smp_processor_id();
/* An error fails the entire request. */
if ( !uptodate )
{
DPRINTK("Buffer not up-to-date at end of operation\n");
- pending_req->status = 2;
+ bh->pending_req->status = 2;
}
- unlock_buffer(pending_req->domain,
- virt_to_phys(bh->b_data),
- bh->b_size,
- (pending_req->operation==READ));
-
- if ( atomic_dec_and_test(&pending_req->pendcnt) )
- {
- make_response(pending_req->domain, pending_req->id,
- pending_req->operation, pending_req->status);
- put_task_struct(pending_req->domain);
- spin_lock_irqsave(&pend_prod_lock, flags);
- pending_ring[pending_prod] = pending_req - pending_reqs;
- PENDREQ_IDX_INC(pending_prod);
- spin_unlock_irqrestore(&pend_prod_lock, flags);
- atomic_dec(&nr_pending);
- maybe_trigger_io_schedule();
- }
+ /* irqs disabled while linking onto the per-CPU list: the softirq
+  * handler detaches the whole list under the same protection. */
+ local_irq_save(flags);
+ bh->b_reqnext = completed_bhs[cpu];
+ completed_bhs[cpu] = bh;
+ local_irq_restore(flags);
- kmem_cache_free(buffer_head_cachep, bh);
+ /* NB. the bh is now freed by the softirq handler, not here. */
+ __cpu_raise_softirq(cpu, BLKDEV_RESPONSE_SOFTIRQ);
}
+
+
/* ----[ Syscall Interface ]------------------------------------------------*/
long do_block_io_op(block_io_op_t *u_block_io_op)
unsigned short size,
int writeable_buffer)
{
- unsigned long pfn, flags;
+ unsigned long pfn;
struct pfn_info *page;
- spin_lock_irqsave(&p->page_lock, flags);
+ spin_lock(&p->page_lock);
for ( pfn = buffer >> PAGE_SHIFT;
pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
pfn++ )
put_page_type(page);
put_page_tot(page);
}
- spin_unlock_irqrestore(&p->page_lock, flags);
+ spin_unlock(&p->page_lock);
}
static int do_block_io_op_domain(struct task_struct *p, int max_to_do)
struct buffer_head *bh;
int operation = (req->operation == XEN_BLOCK_WRITE) ? WRITE : READ;
unsigned short nr_sects;
- unsigned long buffer, flags;
+ unsigned long buffer;
int i, tot_sects;
pending_req_t *pending_req;
int new_segs, nr_psegs = 0;
phys_seg_t phys_seg[MAX_BLK_SEGS * 2];
- spin_lock_irqsave(&p->page_lock, flags);
+ spin_lock(&p->page_lock);
/* Check that number of segments is sane. */
if ( (req->nr_segments == 0) || (req->nr_segments > MAX_BLK_SEGS) )
for ( i = 0; i < nr_psegs; i++ )
__lock_buffer(phys_seg[i].buffer, phys_seg[i].nr_sects<<9,
(operation==READ));
- spin_unlock_irqrestore(&p->page_lock, flags);
+ spin_unlock(&p->page_lock);
atomic_inc(&nr_pending);
pending_req = pending_reqs + pending_ring[pending_cons];
return;
bad_descriptor:
- spin_unlock_irqrestore(&p->page_lock, flags);
+ spin_unlock(&p->page_lock);
make_response(p, req->id, req->operation, 1);
}
static void make_response(struct task_struct *p, unsigned long id,
unsigned short op, unsigned long st)
{
- unsigned long cpu_mask, flags;
+ unsigned long cpu_mask;
int position;
blk_ring_t *blk_ring;
/* Place on the response ring for the relevant domain. */
- spin_lock_irqsave(&p->blk_ring_lock, flags);
+ spin_lock(&p->blk_ring_lock);
blk_ring = p->blk_ring_base;
position = p->blk_resp_prod;
blk_ring->ring[position].resp.id = id;
blk_ring->ring[position].resp.operation = op;
blk_ring->ring[position].resp.status = st;
p->blk_resp_prod = blk_ring->resp_prod = BLK_RING_INC(position);
- spin_unlock_irqrestore(&p->blk_ring_lock, flags);
+ spin_unlock(&p->blk_ring_lock);
/* Kick the relevant domain. */
cpu_mask = mark_guest_event(p, _EVENT_BLKDEV);
atomic_set(&nr_pending, 0);
pending_prod = pending_cons = 0;
memset(pending_reqs, 0, sizeof(pending_reqs));
- for ( i = 0; i < MAX_PENDING_REQS; i++ ) pending_ring[i] = i;
+ for ( i = 0; i < MAX_PENDING_REQS; i++ )
+ pending_ring[i] = i;
+
+ for ( i = 0; i < NR_CPUS; i++ )
+ completed_bhs[i] = NULL;
+
+ open_softirq(BLKDEV_RESPONSE_SOFTIRQ, end_block_io_op_softirq, NULL);
spin_lock_init(&io_schedule_list_lock);
INIT_LIST_HEAD(&io_schedule_list);
};
-/* Who gets which entry in bh_base. Things which will occur most often
- should come first */
-
enum {
TIMER_BH = 0,
- TQUEUE_BH,
- SCSI_BH,
- IMMEDIATE_BH
+ SCSI_BH
};
#include <asm/hardirq.h>
#include <asm/softirq.h>
-
-/* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
- frequency threaded job scheduling. For almost all the purposes
- tasklets are more than enough. F.e. all serial device BHs et
- al. should be converted to tasklets, not to softirqs.
- */
-
enum
{
HI_SOFTIRQ=0,
NET_RX_SOFTIRQ,
AC_TIMER_SOFTIRQ,
- TASKLET_SOFTIRQ
+ TASKLET_SOFTIRQ,
+ BLKDEV_RESPONSE_SOFTIRQ, /* deferred block-device completion responses */
+ NET_TX_SOFTIRQ /* deferred freeing of tx skbs queued from irq context */
};
/* softirq mask and active fields moved to irq_cpustat_t in
struct vlan_group;
+extern struct skb_completion_queues {
+ struct sk_buff *rx; /* Packets received in interrupt context. */
+ unsigned int rx_qlen;
+ struct sk_buff *tx; /* Tx buffers defunct in interrupt context. */
+} skb_queue[NR_CPUS] __cacheline_aligned;
+
/* Backlog congestion levels */
#define NET_RX_SUCCESS 0 /* keep 'em coming, baby */
#define NET_RX_DROP 1 /* packet dropped */
}
-/*
- * Xen does not need deferred skb freeing, as all destructor hook functions
- * are IRQ safe. Linux needed more care for some destructors...
+/* Use this variant when it is known for sure that it
+ * is executing from interrupt context.
+ */
+static inline void dev_kfree_skb_irq(struct sk_buff *skb)
+{
+ int cpu = smp_processor_id();
+ unsigned long flags;
+ local_irq_save(flags);
+ /* Defer the free: chain onto this CPU's defunct-tx list and let the
+  * NET_TX_SOFTIRQ handler perform the actual free. */
+ skb->next = skb_queue[cpu].tx;
+ skb_queue[cpu].tx = skb;
+ __cpu_raise_softirq(cpu, NET_TX_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+/* Use this variant in places where it could be invoked
+ * either from interrupt or non-interrupt context.
*/
-#define dev_kfree_skb_irq(_skb) dev_kfree_skb(_skb)
-#define dev_kfree_skb_any(_skb) dev_kfree_skb(_skb)
+static inline void dev_kfree_skb_any(struct sk_buff *skb)
+{
+ /* NB(review): in_irq() detects hardirq context only; softirq callers
+  * take the plain dev_kfree_skb() path, presumed safe there -- confirm. */
+ if (in_irq())
+ dev_kfree_skb_irq(skb);
+ else
+ dev_kfree_skb(skb);
+}
extern void net_call_rx_atomic(void (*fn)(void));
extern int netif_rx(struct sk_buff *skb);
#define TX_RING_ADD(_i,_j) (((_i)+(_j)) & (TX_RING_SIZE-1))
#define RX_RING_ADD(_i,_j) (((_i)+(_j)) & (RX_RING_SIZE-1))
-static struct sk_buff_head rx_skb_queue[NR_CPUS] __cacheline_aligned;
+struct skb_completion_queues skb_queue[NR_CPUS] __cacheline_aligned;
static int get_tx_bufs(net_vif_t *vif);
int netif_rx(struct sk_buff *skb)
{
- int this_cpu = smp_processor_id();
- struct sk_buff_head *q = &rx_skb_queue[this_cpu];
+ int cpu = smp_processor_id();
unsigned long flags;
- /* This oughtn't to happen, really! */
- if ( unlikely(skb_queue_len(q) > 100) )
+ local_irq_save(flags);
+
+ if ( unlikely(skb_queue[cpu].rx_qlen > 100) )
{
+ local_irq_restore(flags);
perfc_incr(net_rx_congestion_drop);
return NET_RX_DROP;
}
- local_irq_save(flags);
- __skb_queue_tail(q, skb);
+ skb->next = skb_queue[cpu].rx;
+ skb_queue[cpu].rx = skb;
+
local_irq_restore(flags);
- __cpu_raise_softirq(this_cpu, NET_RX_SOFTIRQ);
+ __cpu_raise_softirq(cpu, NET_RX_SOFTIRQ);
return NET_RX_SUCCESS;
}
+/*
+ * NET_RX_SOFTIRQ handler: detach this CPU's rx list and process each
+ * packet with interrupts enabled.
+ */
static void net_rx_action(struct softirq_action *h)
{
- int offset, this_cpu = smp_processor_id();
- struct sk_buff_head *q = &rx_skb_queue[this_cpu];
- struct sk_buff *skb;
+ int offset, cpu = smp_processor_id();
+ struct sk_buff *skb, *nskb;
local_irq_disable();
-
- while ( (skb = __skb_dequeue(q)) != NULL )
+ /* Grab the whole list and reset the queue-length accounting; irqs are
+  * off so netif_rx() cannot interleave with us on this CPU. */
+ skb = skb_queue[cpu].rx;
+ skb_queue[cpu].rx = NULL;
+ skb_queue[cpu].rx_qlen = 0;
+ local_irq_enable();
+
+ while ( skb != NULL )
{
ASSERT(skb->skb_type == SKB_ZERO_COPY);
skb_push(skb, ETH_HLEN);
skb->mac.raw = skb->data;
- netdev_rx_stat[this_cpu].total++;
+ netdev_rx_stat[cpu].total++;
if ( skb->dst_vif == NULL )
skb->dst_vif = net_get_target_vif(
}
unmap_domain_mem(skb->head);
+
+ /* Save the link before kfree_skb() destroys the skb. */
+ nskb = skb->next;
kfree_skb(skb);
+ skb = nskb;
}
-
- local_irq_enable();
}
}
+/*
+ * NET_TX_SOFTIRQ handler: free the tx skbs that dev_kfree_skb_irq()
+ * queued from interrupt context on this CPU.
+ */
+static void net_tx_gc(struct softirq_action *h)
+{
+ int cpu = smp_processor_id();
+ struct sk_buff *skb, *nskb;
+
+ /* Detach the list atomically w.r.t. the hardirq enqueuer. */
+ local_irq_disable();
+ skb = skb_queue[cpu].tx;
+ skb_queue[cpu].tx = NULL;
+ local_irq_enable();
+
+ while ( skb != NULL )
+ {
+ nskb = skb->next;
+ __kfree_skb(skb);
+ skb = nskb;
+ }
+}
+
/* Destructor function for tx skbs. */
static void tx_skb_release(struct sk_buff *skb)
{
int i;
- net_vif_t *vif = skb->src_vif;
- unsigned long flags;
+ net_vif_t *vif;
+
+ vif = skb->src_vif;
+ /* NB(review): plain spin_lock is safe only because skb destruction is
+  * now deferred out of hardirq context via NET_TX_SOFTIRQ -- confirm. */
- spin_lock_irqsave(&vif->domain->page_lock, flags);
+ spin_lock(&vif->domain->page_lock);
for ( i = 0; i < skb_shinfo(skb)->nr_frags; i++ )
put_page_tot(skb_shinfo(skb)->frags[i].page);
- spin_unlock_irqrestore(&vif->domain->page_lock, flags);
-
+ spin_unlock(&vif->domain->page_lock);
+
if ( skb->skb_type == SKB_NODATA )
kmem_cache_free(net_header_cachep, skb->head);
-
+
skb_shinfo(skb)->nr_frags = 0;
-
- spin_lock_irqsave(&vif->tx_lock, flags);
+
+ spin_lock(&vif->tx_lock);
__make_tx_response(vif, skb->guest_id, RING_STATUS_OK);
- spin_unlock_irqrestore(&vif->tx_lock, flags);
-
+ spin_unlock(&vif->tx_lock);
+
/*
- * Checks below must happen after the above response is posted.
- * This avoids a possible race with a guest OS on another CPU.
+ * Checks below must happen after the above response is posted. This avoids
+ * a possible race with a guest OS on another CPU.
*/
smp_mb();
-
+
if ( (vif->tx_cons == vif->tx_prod) && get_tx_bufs(vif) )
{
add_to_net_schedule_list_tail(vif);
maybe_schedule_tx_action();
}
-
+
put_vif(vif);
}
struct sk_buff *skb;
tx_req_entry_t tx;
int i, j, ret = 0;
- unsigned long flags;
if ( vif->tx_req_cons == shared_idxs->tx_req_prod )
return 0;
- spin_lock_irqsave(&vif->tx_lock, flags);
+ spin_lock(&vif->tx_lock);
/* Currently waiting for more credit? */
if ( vif->remaining_credit == 0 )
vif->tx_prod = j;
out:
- spin_unlock_irqrestore(&vif->tx_lock, flags);
+ spin_unlock(&vif->tx_lock);
return ret;
}
pte_pfn = rx.addr >> PAGE_SHIFT;
pte_page = frame_table + pte_pfn;
- spin_lock_irq(&p->page_lock);
+ spin_lock(&p->page_lock);
if ( (pte_pfn >= max_page) ||
((pte_page->flags & (PG_type_mask | PG_domain_mask)) !=
(PGT_l1_page_table | p->domain)) )
{
DPRINTK("Bad page frame for ppte %d,%08lx,%08lx,%08lx\n",
p->domain, pte_pfn, max_page, pte_page->flags);
- spin_unlock_irq(&p->page_lock);
+ spin_unlock(&p->page_lock);
make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
continue;
}
rx_unmap_and_continue:
unmap_domain_mem(ptep);
- spin_unlock_irq(&p->page_lock);
+ spin_unlock(&p->page_lock);
}
vif->rx_req_cons = i;
+/*
+ * Return all outstanding rx buffers to the guest and drop all pending
+ * tx requests for this vif. Always returns 0.
+ */
long flush_bufs_for_vif(net_vif_t *vif)
{
int i;
- unsigned long *pte, flags;
+ unsigned long *pte;
struct pfn_info *page;
struct task_struct *p = vif->domain;
rx_shadow_entry_t *rx;
net_idx_t *shared_idxs = vif->shared_idxs;
/* Return any outstanding receive buffers to the guest OS. */
+ /* NB(review): plain spin_lock assumes this never runs in irq context
+  * -- confirm against callers. */
- spin_lock_irqsave(&p->page_lock, flags);
+ spin_lock(&p->page_lock);
for ( i = vif->rx_req_cons;
(i != shared_idxs->rx_req_prod) &&
(((vif->rx_resp_prod-i) & (RX_RING_SIZE-1)) != 1);
make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0);
}
vif->rx_cons = i;
- spin_unlock_irqrestore(&p->page_lock, flags);
+ spin_unlock(&p->page_lock);
/*
* Flush pending transmit buffers. The guest may still have to wait for
* buffers that are queued at a physical NIC.
*/
- spin_lock_irqsave(&vif->tx_lock, flags);
+ spin_lock(&vif->tx_lock);
for ( i = vif->tx_req_cons;
(i != shared_idxs->tx_req_prod) &&
(((vif->tx_resp_prod-i) & (TX_RING_SIZE-1)) != 1);
RING_STATUS_DROPPED);
}
vif->tx_req_cons = i;
- spin_unlock_irqrestore(&vif->tx_lock, flags);
+ spin_unlock(&vif->tx_lock);
return 0;
}
case NETOP_RESET_RINGS:
/* We take the tx_lock to avoid a race with get_tx_bufs. */
- spin_lock_irq(&vif->tx_lock);
+ spin_lock(&vif->tx_lock);
if ( (vif->rx_req_cons != vif->rx_resp_prod) ||
(vif->tx_req_cons != vif->tx_resp_prod) )
{
vif->tx_req_cons = vif->tx_resp_prod = 0;
ret = 0;
}
- spin_unlock_irq(&vif->tx_lock);
+ spin_unlock(&vif->tx_lock);
break;
case NETOP_GET_VIF_INFO:
unsigned char st,
unsigned char off)
{
- unsigned long flags;
unsigned int pos;
rx_resp_entry_t *resp;
/* Place on the response ring for the relevant domain. */
- spin_lock_irqsave(&vif->rx_lock, flags);
+ spin_lock(&vif->rx_lock);
pos = vif->rx_resp_prod;
resp = &vif->shared_rings->rx_ring[pos].resp;
resp->id = id;
unsigned long cpu_mask = mark_guest_event(vif->domain, _EVENT_NET);
guest_event_notify(cpu_mask);
}
- spin_unlock_irqrestore(&vif->rx_lock, flags);
+ spin_unlock(&vif->rx_lock);
}
int setup_network_devices(void)
{
- int i, ret;
+ int ret;
extern char opt_ifname[];
- for ( i = 0; i < smp_num_cpus; i++ )
- skb_queue_head_init(&rx_skb_queue[i]);
+ memset(skb_queue, 0, sizeof(skb_queue));
+ /* Actual receive processing happens in softirq context. */
open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
+
+ /* Processing of defunct transmit buffers happens in softirq context. */
+ open_softirq(NET_TX_SOFTIRQ, net_tx_gc, NULL);
+
+ /* Transmit scheduling happens in a tasklet to exclude other processors. */
tasklet_enable(&net_tx_tasklet);
if ( (the_dev = dev_get_by_name(opt_ifname)) == NULL )